home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Magnum One
/
Magnum One (Mid-American Digital) (Disc Manufacturing).iso
/
d18
/
kruse_11.arc
/
INDEXSOR.PAS
< prev
next >
Wrap
Pascal/Delphi Source File
|
1990-11-30
|
31KB
|
897 lines
{outline of declaration of subprograms:
1. program IndexText(InText, InIndex, NewIndex, OutIndex, HashFile,
NewHashFile, input, output); (main program)
2. function Lt(u, v: word): Boolean;
3. procedure ReadWord(var f: text; var w: word);
4. procedure WriteWord(var f: text; w: word);
4a. built in CPU time function clock;
5. procedure SplitWords; (phase 1)
5a. function FindFile(ch: char): filecode;
6. function HashAddress(w: word): hashentry;
7. procedure Initialize;
8. procedure GetWord;
8a. procedure TellUserPage;
9. procedure GetChar(var ch: char);
10. procedure AddChar(ch: char);
11. procedure Conclude;
procedure ClassifyWords;
procedure InitializeTable(RefTable: RefHashTable);
function HashAddress(x: reference): integer;
procedure Insert(x: reference; pos: integer; var RefTable: RefHashTable);
procedure Place(var F: fileref; RefTable: RefHashTable);
function Empty(L: list): Boolean;
procedure LinkEntries(RefTable: RefHashTable; var NewList: list);
procedure RemoveFirst(var p: pointer; L: list);
procedure SkipBlank(var F: text);
procedure ReadReference(var r: pointer; var F: text);
procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text);
procedure GetWordType(p: pointer);
procedure Delete(var p: pointer);
procedure CompareAndMerge(NewList: list;var InIndex,NewIndex,NewHashFile: text);
procedure Merge(p, q: pointer; var r: pointer);
procedure Divide(var p, q: pointer);
procedure MergeSort(var p: pointer);
procedure MainMergeSort(var L: list);
}
program IndexText(InText, InIndex, NewIndex, HashFile, NewHashFile,
input, output);
{Produces word counts and list of references for the document file
InText. Uses the master word list in file InIndex, if provided. Output word
list for new text goes to file NewIndex. HashFile contains the common words
to be ignored. If not specified, it is created on output, containing the
words so flagged by the user.}
{This implementation uses only phases 1 and 2. A smaller array of text files
is also used, as specified in the exercise section.}
const
maxwd = 20; {More letters in word will be ignored.}
minwd = 1; {Shorter words will be ignored.}
hashsize = 2003; {should be a prime}
linesperpage = 66; {assumes standard spacing and paper}
maxheight = 20; {for building binary tree in phase 2}
A = 'A';
Z = 'Z';
hyphen = '-';
blank = ' ';
apostrophe = ''''; {requires two `'s to represent one}
underscore = '_';
ordbackspace = 8; {ASCII control character for backspace}
ordformfeed = 12; {ASCII control character for new page}
changecase = 32; {ASCII difference between upper and lower case}
nfiles = 8; {number of temporary files for unprocessed words}
MaxRowLength = 130; {maximum length of output records}
type
word = packed array[1..maxwd] of char;
reference = record
wd: word;
pg: integer; {count or page number}
end;
fileref = file of reference; {used for local files}
letter = A..Z;
hashentry = 1..hashsize;
filecode = 1..nfiles;
var
InText, {document being processed}
InIndex, {master word list}
NewIndex, {word list of current document}
HashFile,
NewHashFile: text;
RefFile: array[filecode] of fileref; {local files used for auxilary
storage of words from phase 1 to phase 2:
Normally, a separate file exist for each initial letter,
this version uses nfiles files due operating system constraints.}
blankword: word; {will contain all blanks}
{The next two variables were originally declared in procedure SplitWords,
they have been moved to this level in order to access them globally.}
outcount: array[filecode] of integer; {counters for word files}
wordcount: integer; {count of all words in the text}
intextname,
inlistname,
newlistname,
newhashname: word; {used to get filename from user}
lastletter: array[filecode] of letter; {last letter in each file}
PresentTime,
StartTime: integer; {used to track CPU time}
RowLength: integer; {ensures records will not exceed MaxRowLength}
function Lt( u, v: word): Boolean;
{Determains if word u precedes word v lexicographically.}
begin
Lt := (u < v)
end;
procedure ReadWord( var F: text; var w: word);
{Reads word w from text file F. Assumes not at end of file.}
{Uses packed array, replace using a loop if your system does not
support packed arrays. }
begin {procedure ReadWord}
read(F, w)
end; {procedure ReadWord}
procedure WriteWord( var F: text; w: word);
{Writes word w to text file F}
{Uses packed array, replace using a loop if your system does not
support packed arrays. }
begin {procedure WriteWord}
write(F, w)
end; {procedure WriteWord}
procedure SetTimer; {Call once at beginning of program execution.}
{Finds the CPU time when called, and keeps in variables for reference.}
{System dependent procedure.}
begin
PresentTime := clock;
StartTime := PresentTime;
end;
function TotalTime: real;
{Returns the total CPU time, in seconds, since call to SetTimer.}
{System dependent procedure.}
begin
TotalTime := (clock - StartTime) / 1000.0;
end;
function ElapsedTime: real;
{Returns elapsed CPU time since last call to function ElapsedTime,
or call to SetTimer, whichever is more recent.}
{System dependent procedure.}
var r: integer;
begin
r := clock;
ElapsedTime := (r - PresentTime) / 1000.0;
PresentTime := r;
end;
procedure SplitWords;
{sets up hash table, reads text, and divides into nfiles word lists}
var
hash: array[hashentry] of reference; {hash table}
pagecount: integer; {keeps the current page number}
addpage: integer; {amount to increase pagecount after word}
linecount: integer; {lines on the current page}
w: word; {word currently being processed}
x: hashentry; {location of w, if in hash table}
endinput: Boolean; {true if and only if input has all been read}
code: filecode; {into which file does word go?}
{The following variables are kept for use in procedure GetWord, and for
efficiency are set up only once in procedure Initialize:}
backspace,
formfeed: char;
alphabet, {letters only - to start a word}
contchar: set of char; {other characters ok in middle of word}
function FindFile( ch: letter): filecode;
{Uses binary decision tree to select one of nfiles = 8 files depending
on the letter ch. These letters must be the same as those in the
global array lastletter .}
begin {function FindFile}
if ch < 'M' then
if ch < 'E' then
if ch < 'C' then FindFile := 1
else FindFile := 2
else if ch < 'H' then FindFile := 3
else FindFile := 4
else if ch < 'S' then
if ch < 'P' then FindFile := 5
else FindFile := 6
else if ch < 'T' then FindFile := 7
else FindFile := 8
end; {function FindFile}
function HashAddress(w: word): hashentry;
{calculates the location in hash table of word w, or, if not there,
returns pointing to the blank word where w should go}
var
x, {calculated location}
inc: integer; {increment for open addressing}
begin {function HashAddress}
x := abs(ord(w[1])*ord(w[2])+ord(w[4])+ord(w[6])) mod hashsize + 1;
{Hash function assumes long word length. For short word machines
we must ensure that the result is non-negative, and worry about overflow.}
if (hash[x].wd <> w) and (hash[x].wd <> blankword) then
begin
inc := (abs(ord(w[3])-95) mod 29);
{A key dependent increment is used to avoid clustering.}
repeat
inc := inc + 1;
if inc > hashsize then
writeln(w,' causes hash table to become full, infinite loop.');
x := x + inc;
if x > hashsize then x := x - hashsize;
until (w = hash[x].wd) or (blankword = hash[x].wd)
end;
HashAddress := x
end; {function HashAddress}
procedure Initialize;
{sets up constant-valued sets for use in GetWord. Opens the text file
and initializes various counters. Opens file holding hash table (if any),
and reads or otherwise initializes table}
var
i: integer; {general purpose loop control}
begin {procedure Initialize}
backspace:= chr(ordbackspace);
formfeed := chr(ordformfeed); {initialize ASCII control characters}
alphabet := ['A'..'Z', 'a'..'z']; {letters only, to start a word}
contchar := [hyphen, apostrophe, backspace, underscore];
{characters which will not terminate word}
for i := 1 to maxwd do
blankword[i] := blank;
write('Name of input text file?');
ReadWord(input, intextname); readln;
open(InText, intextname, readonly);
reset(InText);
endinput := eof(InText);
repeat
write( 'What is the page number on which the text begins?');
readln(pagecount);
if pagecount < 0 then
writeln('Must be a non-negative integer.')
until pagecount >= 0;
linecount := 0;
addpage := 0;
wordcount := 0;
for i := 1 to nfiles do
begin
rewrite( RefFile[i] );
outcount[i] := 0
end;
lastletter[1] := 'B';
lastletter[2] := 'D';
lastletter[3] := 'G';
lastletter[4] := 'L';
lastletter[5] := 'O';
lastletter[6] := 'R';
lastletter[7] := 'S';
lastletter[8] := 'Z';
reset(HashFile); {assumes HASHFILE.DAT is in current directory}
for i := 1 to hashsize do
with hash[i] do
begin
read(HashFile, pg);
get(HashFile); {skip the blank between number and word}
ReadWord(HashFile, wd);
readln(HashFile);
pg := 0; {initialize all the counts to 0}
end;
writeln('The hash table has been read.')
end; {procedure Initialize}
procedure GetWord( var w: word);
{Gets words from input file InText, and returns only words
at least minwd characters long. Parameter endinput becomes
true if and only if the end of InText is reached with no word to return.
the procedure also updates global variables wordcount and linecount,
updates the global variable pagecount after each linesperpage cr's,
or after each formfeed, whichever comes first, and
uses the sets alphabet and contchar and various character constants.}
label 1; {used by GetChar to exit procedure upon eof(InText)}
var c: 0..maxwd; {count of characters in word}
ch: char; {character currently processed}
endln: Boolean; {at the end of a line?}
procedure TellUserPage; {keep the user informed of progress}
var i: integer;
begin
i := pagecount + addpage;
writeln('At page', i:4, ' word count is', wordcount:7)
end;
procedure GetChar(var ch: char);
{gets a character from input text into ch; checks for eof; updates
page count and line count}
begin {procedure GetChar}
if eof(InText) then
if c >= minwd then
ch := '.' {special character to end the current word}
else begin {no word to return; set endinput}
endinput := true;
goto 1 {exit from GetWord.}
end
else begin {not end of file: process next character}
while InText^ in [underscore, backspace] do
get( InText);
ch := InText^;
endln := eoln(InText);
get(InText);
if endln then
begin
linecount := linecount + 1;
if linecount >= linesperpage then
begin
addpage := addpage + 1;
linecount := 0;
TellUserPage
end
end;
if ch = formfeed then
begin
addpage := addpage + 1;
linecount := 0;
TellUserPage;
endln := true; {Treat formfeed like end of line.}
ch := blank
end
end
end; {procedure GetChar}
procedure AddChar(ch: char);
{adds given character to word, if possible}
begin {procedure AddChar}
if c < maxwd then
begin
c := c + 1;
w[c] := ch
end
end; {procedure AddChar}
begin {procedure GetWord}
repeat {until current word is at least minwd chars long}
c := 0;
repeat
GetChar(ch) {Find a letter which will start the word.}
until ch in alphabet;
pagecount := pagecount + addpage;
addpage := 0;
if ch in ['a'..'z'] then {translate first letter to upper case.}
ch := chr(ord(ch) - changecase); {assumes ASCII ordering of letters}
AddChar(ch); {put first letter into the word}
GetChar(ch);
while (ch in alphabet) or (ch in contchar) do
if ch in alphabet then {add letters directly to word}
begin {processing letter}
AddChar(ch);
GetChar(ch)
end {processing letter}
else if ch = hyphen then
begin {processing hyphen}
GetChar(ch); {Find what comes after hyphen.}
if endln then
while ch = ' ' do
GetChar(ch) {Delete both the hyphen and the end of line}
else if ch = hyphen then {Two hyphens form a dash; ends word}
ch := blank {Use a blank to terminate the word.}
else if ch in alphabet then
AddChar(hyphen) {Include other hyphens in word}
else {nothing}
end {processing hyphen}
else if ch = apostrophe then
begin {processing apostrophe}
GetChar(ch);
if ch = 's' then {Delete `'s' at end of word only}
begin
GetChar(ch);
if ch in contchar then
begin
AddChar(apostrophe);
AddChar('s')
end
end
else if ch in alphabet then
AddChar(apostrophe) {Allow contractions.}
end {processing apostrophe}
else {Remaining possibilities are backspace and underscore.}
GetChar(ch); {Delete these characters.}
{While loop on continuing characters ends here.}
wordcount := wordcount + 1
until c >= minwd; {Skip over short words.}
while c < maxwd do {Fill with blanks.}
begin
c := c + 1;
w[c] := blank
end;
1: {When end of file occurs, program will exit to here from GetChar}
end; {procedure GetWord}
procedure Conclude;
{Writes out counts of various word lists. For some systems, it is
necessary to close files, which should be done here.}
var
i,j: integer; {loop index}
response: char; {user's answer to question}
begin {procedure Conclude}
writeln('The total number of words read in is ', wordcount:7);
writeln;
writeln('The number of words to process further in the next stage,');
writeln('in each temporary file, is below.');
writeln(' a-b c-d e-g h-l m-o p-r s t-z');
for i := 1 to nfiles do
write(outcount[i]:8);
writeln;
writeln;
(* not implemented:
repeat
write('Do you wish the counts from hash table to be kept in a file (y,n)?');
readln(response);
if response > 'Z' then response := chr(ord(response)-changecase)
until response in ['N', 'Y'];
if response = 'Y' then
begin
write('Name of file ?');
ReadWord(input, newhashname);
readln;
open(NewHashFile, newhashname);
rewrite(NewHashFile);
for i := 1 to hashsize do
with hash[i] do begin
write(NewHashFile, pg:4, ' ');
j := 1;
repeat
write(NewHashFile, wd[j]);
j := j + 1;
until (wd[j] = ' ') or (j >= maxwd);
writeln(NewHashFile)
end
end *)
end; {procedure Conclude}
begin {procedure SplitWords}
Initialize; {sets up files, hash table, constants}
GetWord(w); {obtain a single word from InText}
while not endinput do
begin
x := HashAddress(w);
if w = hash[x].wd then
hash[x].pg := hash[x].pg + 1
else begin {not in hash table; put into RefFile}
code := FindFile( w[1] );
outcount[code] := outcount[code] + 1;
with RefFile[code]^ do
begin
wd := w;
pg := pagecount
end;
Put(RefFile[code])
end;
GetWord(w)
end;
Conclude {writes word counts to output.}
end; {procedure SplitWords}
{start of phase 2}
procedure ClassifyWords;
{The references stored in the temporary files are placed in a new hash table,
the words from the file InIndex are compared with the words in the new table
as they are merged into the file NewIndex.}
type
wordtype = (hash, count, index); {ways to process a word}
pointref = ^reflist;
reflist = record {list of references}
pg: integer;
next: pointref
end;
pointer = ^node;
node = record {node of list storing wrods.}
wd: word;
kind: wordtype;
ct: integer;
ref: pointref;
next: pointer
end;
{Cannot use varying types as @wordtype is not known upon first reading.}
list = record
head: pointer
end;
var
code: filecode; {loop through temporary files}
NewList: list;
(*=====================================================================*)
function Empty(L: list): Boolean;
begin
Empty := (L.head = nil)
end;
(*=====================================================================*)
procedure Merge(p, q: pointer; var r: pointer);
{Merges two sorted lists into one, that will begin at r;
requires that both lists be non empty. This version is modified
slightly from the version listed in the text due to a difference
in the data structures used.}
var
s: pointer; {always points to last node of sorted list}
begin {procedure Merge}
if (p = nil) or (q = nil) then
writeln('Merge called with empty list(s).');
{First find the head, r, of the merged list.}
if p^.wd <= q^.wd then {change .info.key to .wd}
begin
r := p;
p := p^.next
end
else begin
r := q;
q := q^.next
end;
s := r; {s always points to the last entry of the merged list.}
while (p <> nil) and (q <> nil) do
if p^.wd <= q^.wd then {change .info.key to .wd}
begin
s^.next := p; {Attach the node with the smaller key to the sorted list.}
s := p;
p := p^.next {Advance to the next unmerged node.}
end
else begin
s^.next := q;
s := q;
q := q^.next
end;
{After one list is exhausted, attach the remainder of the other one.}
if p = nil then
s^.next := q
else
s^.next := p
end; {procedure Merge}
(*===========================================================================*)
procedure Divide(var p, q: pointer);
{takes the list to which p points, divides it in half, and returns with
p pointing to head of the first half and q to the head of second half;
requires that the original list contain at least two items, or an
error occurs}
var
r: pointer;
begin {procedure Divide}
q := p; {Start q at position 1, and r at position 3.}
r := p^.next;
r := r^.next;
while r <> nil do {Move r two positions for each move of q.}
begin
r := r^.next;
q := q^.next;
if r <> nil then
r := r^.next
end;
{Break the list into halves after q^.}
r := q^.next;
q^.next := nil;
q := r
end; {procedure Divide}
procedure MergeSort(var p: pointer);
{divides the list starting at p^ in half, sorts it recursively, and merges
the sublists}
var
q: pointer; {marks the halfway point in the list}
begin
if p <> nil then if p^.next <> nil then
begin {Otherwise, list has 0 or 1 entry, with no need to sort.}
Divide(p, q);
MergeSort(p);
MergeSort(q);
Merge(p, q, p)
End
End;
(*===========================================================================*)
procedure MainMergeSort(var L: list);
{ Main procedure to invoke recursive procedure @MergeSort, as listed
in the text. }
begin
MergeSort(L.head)
end;
procedure InitializeList(var L: list);
begin
L.head := nil
end;
procedure Insert(x: reference; var L: list);
{ Inserts the reference into the hash table of references. }
var
done: Boolean;
p: pointer;
q: pointref;
begin {procedure Insert}
done := false;
p := L.head;
while (p <> nil) and (not done) do
begin
if p^.wd = x.wd then
begin
p^.ct := p^.ct + 1;
new(q);
q^.pg := x.pg;
q^.next := p^.ref;
p^.ref := q;
done := true
end
else
p := p^.next
end;
if not done then
begin {Insert a new entry if the word is not already in the table.}
p := nil;
new(p);
p^.wd := x.wd;
p^.ct := 1; {Initialize the count and the page references.}
new(q);
q^.pg := x.pg;
q^.next := nil;
p^.ref := q;
p^.next := L.head;
L.head := p
end
end; {procedure Insert}
procedure Append(p: pointer; var L: list);
var
q: pointer;
begin {procedure Append}
q := L.head;
if q = nil then
L.head := p
else begin
while q^.next <> nil do
q := q^.next;
q^.next := p
end
end; {procedure Append}
procedure Place(var F: fileref; var L: list);
{ Places the words in file @F into the list of words. }
var
x: reference;
temp: pointer;
begin {procedure Place}
temp := L.head;
L.head := nil;
reset(F);
while not eof(F) do
begin
x := F^;
get(F);
Insert(x, L)
end;
MainMergeSort(L);
Append(temp, L)
end; {procedure Place}
procedure RemoveFirst(var p: pointer; var L: list);
{ Removes the first node from the list @L. }
begin
p := L.head;
if not Empty(L) then
begin
L.head := L.head^.next;
p^.next := nil
end
end;
procedure ReadReference(var r: pointer; var F: text);
{ Reads refernce from the file @F. }
var
k: char;
begin {procedure ReadReference}
if eof(F) then
r := nil
else begin
ReadWord(F, r^.wd);
readln(F, k);
case k of
'F', 'f': r^.kind := hash;
'C', 'c': begin
r^.kind := count;
r^.ct := 0
end;
'I', 'i': begin
r^.kind := index;
r^.ref := nil
end
end
end
end; {procedure ReadReference}
procedure WriteReference(p: pointer; var NewIndex, NewHashFile: text);
var
q: pointref;
begin {procedure WriteReference}
with p^ do
case kind of
hash: begin
WriteWord(NewHashFile, wd);
writeln(NewHashFile)
end;
count:begin
WriteWord(NewIndex, wd);
write(NewIndex, 'c');
writeln(NewIndex, ct:5)
end;
index:begin
WriteWord(NewIndex, wd);
write(NewIndex, 'i');
q := ref;
while q <> nil do
begin
write(NewIndex, q^.pg:5);
q := q^.next
end;
writeln(NewIndex)
end
end
end; {procedure WriteReference}
procedure GetWordType(p: pointer);
{ Request the user to specify the category of the given word. }
var
response: char;
begin {procedure GetWordType}
with p^ do
begin
repeat
WriteWord(output, wd);
write(' is (F, C, I)?');
readln(response)
until response in ['F', 'f', 'C', 'c', 'I', 'i'];
case response of
'F', 'f': kind := hash;
'C', 'c': kind := count;
'I', 'i': kind := index
end
end
end; {procedure GetWordType}
procedure Delete(var p: pointer);
{ Delete the word @p^ as well as all of the page references associated with it. }
var
q, r: pointref;
begin {procedure Delete}
if p^.kind = index then
begin
q := p^.ref;
while q <> nil do
begin {dispose the reference list}
r := q^.next;
dispose(q);
p^.ref := r;
q := r
end
end;
dispose(p) {dispose the node itself}
end; {procedure Delete}
procedure CompareAndMerge(var L: list; var InIndex, NewIndex, NewHashFile: text);
{ Compare the list @NewList with @InIndex, merge if was found. }
var
p, r: pointer;
begin {procedure CompareAndMerge}
RemoveFirst(p, L);
new(r);
ReadReference(r, InIndex);
while (p <> nil) do
if r = nil then
begin
GetWordType(p);
WriteReference(p, NewIndex, NewHashFile);
Delete(p); {Remove reference list and node from memory.}
RemoveFirst(p, L)
end
else if p^.wd < r^.wd then
begin
GetWordType(p);
WriteReference(p, NewIndex, NewHashFile);
Delete(p); {Remove reference list and node from memory.}
RemoveFirst(p, L)
end
else if p^.wd > r^.wd then {do not write word not used to NewIndex}
ReadReference(r, InIndex)
else begin {p^.wd = r^.wd}
p^.kind := r^.kind;
WriteReference(p, NewIndex, NewHashFile);
Delete(p); {Remove reference list and node from memory.}
RemoveFirst(p, L);
ReadReference(r, InIndex)
end
end; {procedure CompareAndMerge}
begin {procedure ClassifyWords}
write('Name of input word list ?');
ReadWord(input, inlistname);
readln;
open(InIndex, inlistname, readonly); {may vary on different systems}
reset(InIndex);
write('Name of output word list ?');
ReadWord(input, newlistname);
readln;
open(NewIndex, newlistname); {may vary on different systems}
rewrite(NewIndex);
write('Name of file for new hash words ?');
ReadWord(input, newhashname);
readln;
open(NewHashFile, newhashname); {may vary on different systems}
rewrite(NewHashFile);
InitializeList(NewList);
for code := nfiles downto 1 do
Place(RefFile[code], NewList);
if not Empty(NewList) then
CompareAndMerge(NewList, InIndex, NewIndex, NewHashFile);
close(InIndex); {may vary on different systems}
close(NewIndex);
close(NewHashFile)
end; {procedure ClassifyWords}
begin {main program}
SetTimer;
SplitWords; {Phase 1}
writeln('Time in first phase is ', ElapsedTime:7:1, ' seconds.');
writeln;
ClassifyWords; {Phase 2}
writeln('Time in second phase is', ElapsedTime:7:1, ' seconds.');
writeln;
writeln('Processing of input document ', intextname, ' is complete.');
writeln('Total time in program was ', TotalTime:7:1, ' seconds.')
end.